{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Using TensorFlow backend.\n"
     ]
    }
   ],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "from keras.preprocessing import text, sequence\n",
    "from sklearn.model_selection import StratifiedShuffleSplit\n",
    "\n",
    "np.random.seed(42)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "INPUT_PATH = '../input/'\n",
    "CACHE_PATH = '../cache/'\n",
    "OUTPUT_PATH ='../output/'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# 读取数据\n",
    "df_train = pd.read_csv(CACHE_PATH + 'train_processed_all.csv')\n",
    "df_predict = pd.read_csv(CACHE_PATH + 'predict_processed_all.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "df_train = df_train[df_train['Score'].notnull()].reset_index(drop=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "5.0    0.605600\n",
       "4.0    0.280627\n",
       "3.0    0.099018\n",
       "2.0    0.009191\n",
       "1.0    0.005564\n",
       "Name: Score, dtype: float64"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_train['Score'].value_counts()/df_train.shape[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "MAX_FEATURES = 20000 \n",
    "MAX_LEN = 300 "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "train_X  = df_train['Discuss'].astype(str)\n",
    "train_y = df_train['Score'].astype(float)\n",
    "predict_X  = df_predict['Discuss'].astype(str)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "tokenizer = text.Tokenizer(num_words=MAX_FEATURES)\n",
    "tokenizer.fit_on_texts(train_X)\n",
    "train_X = tokenizer.texts_to_sequences(train_X)\n",
    "predict_X = tokenizer.texts_to_sequences(predict_X)\n",
    "train_X = sequence.pad_sequences(train_X, maxlen=MAX_LEN)\n",
    "predict_X = sequence.pad_sequences(predict_X, maxlen=MAX_LEN)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(50000, 300)"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "predict_X.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# 分层划分数据集\n",
    "def split_data_set(X,y,seed=42):\n",
    "    split = StratifiedShuffleSplit(n_splits=1,test_size=0.05,random_state=seed)\n",
    "    for train_index,val_index in split.split(X,y):\n",
    "        strat_train_X = X[train_index]\n",
    "        strat_val_X = X[val_index]\n",
    "        strat_train_y = y[train_index]\n",
    "        strat_val_y = y[val_index]\n",
    "    return strat_train_X,strat_val_X,strat_train_y,strat_val_y"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "X_train, X_val, y_train, y_val = split_data_set(train_X, train_y)\n",
    "np.savez(CACHE_PATH + 'data.npz',\n",
    "         X_train=X_train,\n",
    "         X_val=X_val,\n",
    "         y_train=y_train,\n",
    "         y_val=y_val,\n",
    "         X_test=predict_X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "EMBEDDING_FILE = CACHE_PATH + 'chinese_vector_300.txt'\n",
    "EMBEDDING_SIZE = 300"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def get_coefs(word, *arr):\n",
    "    return word, np.asarray(arr, dtype='float32')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open(EMBEDDING_FILE) as f:\n",
    "    embedding_index = dict(get_coefs(*o.strip().split()) for o in f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def get_embedding_matrix(embedding_index):\n",
    "    word_index = tokenizer.word_index\n",
    "    nb_words = min(MAX_FEATURES,len(word_index)) \n",
    "    # 生成初始词向量矩阵\n",
    "    embedding = np.asarray(list(embedding_index.values()))\n",
    "    embedding_mean, embeddiing_std = embedding.mean(),embedding.std()\n",
    "    embedding_matrix = np.random.normal(embedding_mean,embeddiing_std,(nb_words,EMBEDDING_SIZE))\n",
    "    # 填充已有的词向量\n",
    "    for word, i in word_index.items():\n",
    "        if i < MAX_FEATURES:\n",
    "            embedding_vector = embedding_index.get(word)\n",
    "            if embedding_vector is not None:\n",
    "                embedding_matrix[i] = embedding_vector\n",
    "    return embedding_matrix"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "embedding_matrix = get_embedding_matrix(embedding_index)\n",
    "np.save(CACHE_PATH + 'embedding_matrix.npy', embedding_matrix)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
